Preliminary Data Exploration

  • Format Version : 1.23
  • Take Name : Take 2019-09-30 03.59.35 PM
  • Take Notes : Unnamed: 5
  • Capture Frame Rate : 120.000000
  • Export Frame Rate : 120.000000.1
  • Capture Start Time : 2019-09-30 03.59.36.607 下午
  • Total Frames in Take : 4647
  • Total Exported Frames : 4647.1
  • Rotation Type : Quaternion
  • Length Units : Meters
  • Coordinate Space : Global

DataExplorationimage.png

In [1]:
import pandas as pd
In [2]:
# Only take the fifth column as the data header.
data = pd.read_csv("./Marker_records_30s.csv", header=5)
In [3]:
data.shape
Out[3]:
(4647, 785)
In [4]:
data.head()
Out[4]:
Frame Time (Seconds) X Y Z X.1 Y.1 Z.1 X.2 Y.2 ... Z.257 X.258 Y.258 Z.258 X.259 Y.259 Z.259 X.260 Y.260 Z.260
0 0 0.000000 -1.196379 -0.315348 0.384042 -0.878003 -0.743350 0.514558 -0.778298 -0.008184 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 1 0.008333 -1.196096 -0.315403 0.383891 -0.877604 -0.743369 0.514271 -0.777598 -0.008591 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 2 0.016667 -1.195817 -0.315464 0.383751 -0.877154 -0.743512 0.513873 -0.776924 -0.008470 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 3 0.025000 -1.195564 -0.315499 0.383591 -0.876610 -0.743705 0.513720 -0.776570 -0.008560 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 4 0.033333 -1.195335 -0.315493 0.383997 -0.876310 -0.743700 0.513509 -0.776335 -0.008627 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 785 columns

In [96]:
data.describe()
Out[96]:
Frame Time (Seconds) X Y Z X.1 Y.1 Z.1 X.2 Y.2 ... Z.257 X.258 Y.258 Z.258 X.259 Y.259 Z.259 X.260 Y.260 Z.260
count 4647.00000 4647.000000 4614.000000 4614.000000 4614.000000 3729.000000 3729.000000 3729.000000 1201.000000 1201.000000 ... 7.000000 4.000000 4.000000 4.000000 3.000000 3.000000 3.000000 2.000000 2.000000 2.000000
mean 2323.00000 19.358333 -0.228103 0.379084 0.357135 -0.879534 -0.069736 0.073489 -0.132350 0.092492 ... 0.291163 -0.685939 -0.216025 0.299354 -0.203669 -0.517229 0.647039 -0.302500 0.180483 0.216235
std 1341.61768 11.180147 0.401252 0.395690 0.191572 0.239064 0.224740 0.283032 0.423867 0.213296 ... 0.003729 0.000844 0.007152 0.004998 0.001457 0.003046 0.001889 0.001527 0.003149 0.007311
min 0.00000 0.000000 -1.196379 -0.316376 -0.133012 -1.490619 -0.743902 -0.688646 -0.778298 -0.179632 ... 0.286660 -0.687022 -0.226627 0.295681 -0.205309 -0.520418 0.645022 -0.303580 0.178257 0.211066
25% 1161.50000 9.679167 -0.587753 -0.054132 0.257400 -1.006864 -0.076917 -0.089627 -0.624299 -0.031019 ... 0.288637 -0.686201 -0.216804 0.296202 -0.204242 -0.518670 0.646176 -0.303040 0.179370 0.213651
50% 2323.00000 19.358333 -0.118967 0.518624 0.324132 -0.837575 -0.030274 0.057554 0.123882 -0.000867 ... 0.290201 -0.685887 -0.213245 0.297578 -0.203175 -0.516921 0.647330 -0.302500 0.180483 0.216235
75% 3484.50000 29.037500 0.106422 0.672892 0.405290 -0.691696 0.027659 0.227626 0.232128 0.159192 ... 0.293602 -0.685626 -0.212467 0.300731 -0.202850 -0.515635 0.648048 -0.301960 0.181597 0.218820
max 4646.00000 38.716667 0.371130 0.986451 0.873574 -0.394719 0.476907 0.667603 0.328114 0.675931 ... 0.296801 -0.684961 -0.210980 0.306582 -0.202524 -0.514349 0.648766 -0.301420 0.182710 0.221405

8 rows × 785 columns

In [97]:
data.isnull().sum()
Out[97]:
Frame                0
Time (Seconds)       0
X                   33
Y                   33
Z                   33
X.1                918
Y.1                918
Z.1                918
X.2               3446
Y.2               3446
Z.2               3446
X.3               1112
Y.3               1112
Z.3               1112
X.4               1686
Y.4               1686
Z.4               1686
X.5                  8
Y.5                  8
Z.5                  8
X.6                  1
Y.6                  1
Z.6                  1
X.7               1637
Y.7               1637
Z.7               1637
X.8               3072
Y.8               3072
Z.8               3072
X.9                  7
                  ... 
X.251             4645
Y.251             4645
Z.251             4645
X.252             4640
Y.252             4640
Z.252             4640
X.253             4534
Y.253             4534
Z.253             4534
X.254             4639
Y.254             4639
Z.254             4639
X.255             4633
Y.255             4633
Z.255             4633
X.256             4638
Y.256             4638
Z.256             4638
X.257             4640
Y.257             4640
Z.257             4640
X.258             4643
Y.258             4643
Z.258             4643
X.259             4644
Y.259             4644
Z.259             4644
X.260             4645
Y.260             4645
Z.260             4645
Length: 785, dtype: int64

Data Summary

  • 4647 frames in total and having 785 columns
  • Lots of columns got missing value
In [7]:
import numpy as np
In [8]:
def getMarkersPosition(lst):
    pos = {
        'f': 0,
        't': 0.0,
        'x': [],
        'y': [],
        'z': []
    }
    for i in data.columns:
        if 'Frame' in i:
            pos['f'] = list(lst[i])[0]
            
        if 'Time' in i:
            pos['t'] = list(lst[i])[0]
            
        if 'X' in i and not np.isnan(
            list(lst[i])[0]
        ):
            pos['x'].append(
                list(lst[i])[0]
            )
        if 'Y' in i and not np.isnan(
            list(lst[i])[0]
        ):
            pos['y'].append(
                list(lst[i])[0]
            )
        if 'Z' in i and not np.isnan(
            list(lst[i])[0]
        ):
            pos['z'].append(
            list(lst[i])[0]
            )
            
    return pos
In [103]:
import plotly.graph_objects as go
import plotly.io as pio
import plotly.offline as pltoff

# pio.renderers.default = "browser"
pio.renderers.default = "notebook"

def plotMarkers(pos):
    title_txt = "{0} Markers Captureed in {1} Frame at {2} second".format(len(pos['x']), pos['f'], pos['t'])
#     print(title_txt)
    x, y, z = pos['x'], pos['y'], pos['z']
    fig = go.Figure(data=[go.Scatter3d(
        x=x,
        y=y,
        z=z,
        mode='markers',
        marker=dict(
            size=3,
            color='purple',                # set color to an array/list of desired values
    #         colorscale='Viridis',   # choose a colorscale
            opacity=0.8
        )
    )])

    # tight layout
#     fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
    fig.update_layout(title=title_txt,          
                xaxis_title='Month',
                yaxis_title='Temperature (degrees F)')
    fig.show()
In [104]:
import random
rd = random.randint(1, data.shape[0])
f = getMarkersPosition(data[rd: rd+1])
plotMarkers(f)
In [105]:
rd = random.randint(1, data.shape[0])
f = getMarkersPosition(data[rd: rd+1])
plotMarkers(f)
In [106]:
rd = random.randint(1, data.shape[0])
f = getMarkersPosition(data[rd: rd+1])
plotMarkers(f)
In [35]:
Marker_num_collection = []
# for i in range(1, 50): # Test
for i in range(1, data.shape[0]):
    pos = getMarkersPosition(data[i: i+1])
    Marker_num = len(pos['x'])
    f_num = pos['f']
    Marker_num_collection.append(Marker_num)
#     print(Marker_Num, f_num)
    if f_num in [i*100 for i in range(46)]:
        print("Processed {:.2}%".format(f_num/data.shape[0]))
Processed 2.2%
Processed 4.3%
Processed 6.5%
Processed 8.6%
Processed 1.1e+01%
Processed 1.3e+01%
Processed 1.5e+01%
Processed 1.7e+01%
Processed 1.9e+01%
Processed 2.2e+01%
Processed 2.4e+01%
Processed 2.6e+01%
Processed 2.8e+01%
Processed 3e+01%
Processed 3.2e+01%
Processed 3.4e+01%
Processed 3.7e+01%
Processed 3.9e+01%
Processed 4.1e+01%
Processed 4.3e+01%
Processed 4.5e+01%
Processed 4.7e+01%
Processed 4.9e+01%
Processed 5.2e+01%
Processed 5.4e+01%
Processed 5.6e+01%
Processed 5.8e+01%
Processed 6e+01%
Processed 6.2e+01%
Processed 6.5e+01%
Processed 6.7e+01%
Processed 6.9e+01%
Processed 7.1e+01%
Processed 7.3e+01%
Processed 7.5e+01%
Processed 7.7e+01%
Processed 8e+01%
Processed 8.2e+01%
Processed 8.4e+01%
Processed 8.6e+01%
Processed 8.8e+01%
Processed 9e+01%
Processed 9.3e+01%
Processed 9.5e+01%
Processed 9.7e+01%
In [38]:
from collections import Counter
c = Counter(Marker_num_collection)
for k,v in sorted(c.items()):
    print(k,v)
13 1
14 26
15 640
16 2410
17 1308
18 232
19 24
20 5
In [62]:
sorted(c.items())
k = []
v = []
for i in range(len(sorted(c.items()))):
    k.append(str(sorted(c.items())[i][0])+"_Markers")
    v.append(sorted(c.items())[i][1])
In [68]:
pio.renderers.default = "notebook"
fig = go.Figure(data=[go.Pie(labels=k, values=v)])
fig.update_layout(title='Frame w/ Different Markers %')
fig.show()

Data Preproccessing

  • Only take the frame with 16 markers into consideration

\begin{equation} S_{48*2410} =\begin{bmatrix} x_{1}^{1} & x_2^{1} & ... & x_{16}^{1} & y_1^{1} & y_2^{1} & ... & y_{16}^{1} & z_1^{1} & z_2^{1} & ... & z_{16}^{1} \\ x_{1}^{2} & x_2^{2} & ... & x_{16}^{2} & y_1^{2} & y_2^{2} & ... & y_{16}^{2} & z_1^{2} & z_2^{2} & ... & z_{16}^{2} \\  \vdots   & \vdots & \ddots  & \vdots &  \vdots   & \vdots & \ddots  & \vdots &  \vdots   & \vdots & \ddots  & \vdots  \\  x_{1}^{2410} & x_2^{2410} & ... & x_{16}^{2410} & y_1^{2410} & y_2^{2410} & ... & y_{16}^{2410} & z_1^{2410} & z_2^{2410} & ... & z_{16}^{2410} \\ \end{bmatrix} \end{equation}

In [78]:
lst_16m = []
# for i in range(1, 50): # Test
for i in range(1, data.shape[0]):
    pos = getMarkersPosition(data[i: i+1])
    Marker_num = len(pos['x'])
    f_num = pos['f']
    if Marker_num == 16:
        line = pos['x'] + pos['y'] + pos['z']
        lst_16m.append(line)
    if f_num in [i*100 for i in range(46)]:
        print("Processed {:.2%}".format(f_num/data.shape[0]))
Processed 2.15%
Processed 4.30%
Processed 6.46%
Processed 8.61%
Processed 10.76%
Processed 12.91%
Processed 15.06%
Processed 17.22%
Processed 19.37%
Processed 21.52%
Processed 23.67%
Processed 25.82%
Processed 27.98%
Processed 30.13%
Processed 32.28%
Processed 34.43%
Processed 36.58%
Processed 38.73%
Processed 40.89%
Processed 43.04%
Processed 45.19%
Processed 47.34%
Processed 49.49%
Processed 51.65%
Processed 53.80%
Processed 55.95%
Processed 58.10%
Processed 60.25%
Processed 62.41%
Processed 64.56%
Processed 66.71%
Processed 68.86%
Processed 71.01%
Processed 73.17%
Processed 75.32%
Processed 77.47%
Processed 79.62%
Processed 81.77%
Processed 83.93%
Processed 86.08%
Processed 88.23%
Processed 90.38%
Processed 92.53%
Processed 94.68%
Processed 96.84%
In [80]:
X = np.array(lst_16m)
In [81]:
X.shape
Out[81]:
(2410, 48)
In [84]:
X = X.T
In [85]:
X.shape
Out[85]:
(48, 2410)
In [86]:
from sklearn.decomposition import PCA
from sklearn import preprocessing
X_scale = preprocessing.scale(X, axis = 1)
In [87]:
X_scale.std(axis = 1)
Out[87]:
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
In [88]:
X_scale.shape
Out[88]:
(48, 2410)
In [91]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
pca = PCA().fit(X_scale)

#Plotting the Cumulative Summation of the Explained Variance
plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_)[:], 'o-', c='#663399', alpha=.5)
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') #for each component
plt.title('Pulsar Dataset Explained Variance')
plt.show()
In [92]:
n_lst =[ i*10 for i in range(1, 5)]
count =0
for n in n_lst:
    
    plt.figure()
    plt.plot(np.cumsum(pca.explained_variance_ratio_)[:n], 'o-', c='#663399', alpha=.5)
    plt.xlabel('Number of Components')
    plt.ylabel('Variance (%)') #for each component
    plt.title('Test Dataset Explained Variance')
    plt.show()
In [94]:
index = 0
for i in np.cumsum(pca.explained_variance_ratio_)[:20]:
    index += 1
    print("Dimension Size: {}, Variance: {}".format(index, i ))
Dimension Size: 1, Variance: 0.291443116341386
Dimension Size: 2, Variance: 0.4967348933209013
Dimension Size: 3, Variance: 0.6161356444739454
Dimension Size: 4, Variance: 0.7088587484481813
Dimension Size: 5, Variance: 0.7689202786446919
Dimension Size: 6, Variance: 0.8155543579162801
Dimension Size: 7, Variance: 0.8554841705868452
Dimension Size: 8, Variance: 0.8801448176084538
Dimension Size: 9, Variance: 0.9011227415027564
Dimension Size: 10, Variance: 0.9190314780537784
Dimension Size: 11, Variance: 0.932374751137278
Dimension Size: 12, Variance: 0.9430731877972706
Dimension Size: 13, Variance: 0.952700457158455
Dimension Size: 14, Variance: 0.9605340946879282
Dimension Size: 15, Variance: 0.9673674716278177
Dimension Size: 16, Variance: 0.9737755173891942
Dimension Size: 17, Variance: 0.9782182809920628
Dimension Size: 18, Variance: 0.9813571203517318
Dimension Size: 19, Variance: 0.9842142243509908
Dimension Size: 20, Variance: 0.9864179253772055
In [95]:
import seaborn as sns
sns.set()
sns.despine(left=True)

fig, axes = plt.subplots(2, 2, figsize=(15, 10), gridspec_kw=dict(hspace=.4, wspace=.3))
title_settings={'fontsize':16}
subtitles=['Inversed samples with {} components']*3

# Plot Heatmap 1
ax = sns.heatmap(X_scale, cbar=False, ax=axes[0, 0])
ax.set_title('Original samples', **title_settings)

# Plot Heatmap 2, 3, and 4
plot_ind=[[0,1],[1,0],[1,1]]
n_components=[1,2,3,4,5,6]
for nc, title, i in zip(n_components, subtitles, plot_ind):
    pca = PCA(n_components=nc)
    PC = pca.fit_transform(X_scale)
    inversed = pca.inverse_transform(PC)
    ax = sns.heatmap(inversed, cbar=False, ax=axes[i[0],i[1]])
    ax.set_title(title.format(nc), **title_settings)
<Figure size 432x288 with 0 Axes>